From 79e4b2910a4c3ee7736885a630c51ec5a882c3cf Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Mon, 27 Feb 2006 15:52:43 +0100 Subject: [PATCH] Add a compile time option to enable domain 0 running in ring 0. In this mode only a single guest kernel is supported. This mode only works for x86/32 (not x86/64). Signed-off-by: Ian Campbell Signed-off-by: Keir Fraser --- xen/arch/x86/Makefile | 4 + xen/arch/x86/Rules.mk | 4 + xen/arch/x86/domain.c | 18 ++- xen/arch/x86/domain_build.c | 11 ++ xen/arch/x86/traps.c | 2 +- xen/arch/x86/x86_32/asm-offsets.c | 7 + xen/arch/x86/x86_32/entry.S | 19 +++ xen/arch/x86/x86_32/mm.c | 17 ++- xen/arch/x86/x86_32/supervisor_mode_kernel.S | 145 +++++++++++++++++++ xen/arch/x86/x86_32/traps.c | 61 +++++++- xen/arch/x86/x86_64/mm.c | 4 +- xen/common/dom0_ops.c | 7 + xen/common/kernel.c | 2 + xen/include/asm-ia64/config.h | 2 + xen/include/asm-x86/config.h | 6 + xen/include/asm-x86/desc.h | 15 +- xen/include/asm-x86/x86_32/asm_defns.h | 19 ++- 17 files changed, 325 insertions(+), 18 deletions(-) create mode 100644 xen/arch/x86/x86_32/supervisor_mode_kernel.S diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile index 8e93283588..30b391368b 100644 --- a/xen/arch/x86/Makefile +++ b/xen/arch/x86/Makefile @@ -33,6 +33,10 @@ ifeq ($(TARGET_SUBARCH),x86_32) endif endif +ifneq ($(supervisor_mode_kernel),y) +OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS)) +endif + OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS)) OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS)) diff --git a/xen/arch/x86/Rules.mk b/xen/arch/x86/Rules.mk index 79670678b7..f3597ee722 100644 --- a/xen/arch/x86/Rules.mk +++ b/xen/arch/x86/Rules.mk @@ -6,6 +6,7 @@ # 'make clean' before rebuilding. 
# pae ?= n +supervisor_mode_kernel ?= n CFLAGS += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing CFLAGS += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe @@ -32,6 +33,9 @@ ifeq ($(pae),y) CFLAGS += -DCONFIG_X86_PAE=1 endif endif +ifeq ($(supervisor_mode_kernel),y) +CFLAGS += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1 +endif ifeq ($(TARGET_SUBARCH),x86_64) CFLAGS += -m64 -mno-red-zone -fpic -fno-reorder-blocks diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 4f7da5a96c..df61b9b9aa 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -351,17 +351,17 @@ int arch_set_info_guest( if ( !(c->flags & VGCF_HVM_GUEST) ) { - fixup_guest_selector(c->user_regs.ss); - fixup_guest_selector(c->kernel_ss); - fixup_guest_selector(c->user_regs.cs); + fixup_guest_stack_selector(c->user_regs.ss); + fixup_guest_stack_selector(c->kernel_ss); + fixup_guest_code_selector(c->user_regs.cs); #ifdef __i386__ - fixup_guest_selector(c->event_callback_cs); - fixup_guest_selector(c->failsafe_callback_cs); + fixup_guest_code_selector(c->event_callback_cs); + fixup_guest_code_selector(c->failsafe_callback_cs); #endif for ( i = 0; i < 256; i++ ) - fixup_guest_selector(c->trap_ctxt[i].cs); + fixup_guest_code_selector(c->trap_ctxt[i].cs); } else if ( !hvm_enabled ) return -EINVAL; @@ -847,7 +847,11 @@ unsigned long __hypercall_create_continuation( regs = guest_cpu_user_regs(); #if defined(__i386__) regs->eax = op; - regs->eip -= 2; /* re-execute 'int 0x82' */ + + if ( supervisor_mode_kernel ) + regs->eip &= ~31; /* re-execute entire hypercall entry stub */ + else + regs->eip -= 2; /* re-execute 'int 0x82' */ for ( i = 0; i < nr_args; i++ ) { diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index c6dd10baa0..361ca2485e 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -793,6 +793,17 @@ int construct_dom0(struct domain *d, update_pagetables(v); } + if ( supervisor_mode_kernel ) + { + 
v->arch.guest_context.kernel_ss &= ~3; + v->arch.guest_context.user_regs.ss &= ~3; + v->arch.guest_context.user_regs.es &= ~3; + v->arch.guest_context.user_regs.ds &= ~3; + v->arch.guest_context.user_regs.fs &= ~3; + v->arch.guest_context.user_regs.gs &= ~3; + printk("Dom0 runs in ring 0 (supervisor mode)\n"); + } + rc = 0; /* DOM0 is permitted full I/O capabilities. */ diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 109c9fd516..bd590c70ce 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -1429,7 +1429,7 @@ long do_set_trap_table(struct trap_info *traps) if ( cur.address == 0 ) break; - fixup_guest_selector(cur.cs); + fixup_guest_code_selector(cur.cs); memcpy(&dst[cur.vector], &cur, sizeof(cur)); diff --git a/xen/arch/x86/x86_32/asm-offsets.c b/xen/arch/x86/x86_32/asm-offsets.c index a7e970b92b..960a52cb74 100644 --- a/xen/arch/x86/x86_32/asm-offsets.c +++ b/xen/arch/x86/x86_32/asm-offsets.c @@ -72,6 +72,13 @@ void __dummy__(void) DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked); BLANK(); + OFFSET(TSS_ss0, struct tss_struct, ss0); + OFFSET(TSS_esp0, struct tss_struct, esp0); + OFFSET(TSS_ss1, struct tss_struct, ss1); + OFFSET(TSS_esp1, struct tss_struct, esp1); + DEFINE(TSS_sizeof, sizeof(struct tss_struct)); + BLANK(); + OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa); OFFSET(VCPU_svm_hsa_pa, struct vcpu, arch.hvm_svm.host_save_pa); OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb); diff --git a/xen/arch/x86/x86_32/entry.S b/xen/arch/x86/x86_32/entry.S index db194c6c41..6e888f1e9c 100644 --- a/xen/arch/x86/x86_32/entry.S +++ b/xen/arch/x86/x86_32/entry.S @@ -77,6 +77,13 @@ restore_all_guest: testl $X86_EFLAGS_VM,UREGS_eflags(%esp) jnz restore_all_vm86 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL + testl $2,UREGS_cs(%esp) + jnz 1f + call restore_ring0_guest + jmp restore_all_vm86 +1: +#endif FLT1: mov UREGS_ds(%esp),%ds FLT2: mov UREGS_es(%esp),%es FLT3: mov UREGS_fs(%esp),%fs @@ -157,6 +164,7 @@ restore_all_xen: ALIGN 
ENTRY(hypercall) subl $4,%esp + FIXUP_RING0_GUEST_STACK SAVE_ALL(b) sti GET_CURRENT(%ebx) @@ -294,6 +302,11 @@ FLT14: movl %eax,%gs:(%esi) popl %eax shll $16,%eax # Bits 16-23: saved_upcall_mask movw UREGS_cs+4(%esp),%ax # Bits 0-15: CS +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL + testw $2,%ax + jnz FLT15 + and $~3,%ax # RPL 1 -> RPL 0 +#endif FLT15: movl %eax,%gs:4(%esi) test $0x00FF0000,%eax # Bits 16-23: saved_upcall_mask setz %ch # %ch == !saved_upcall_mask @@ -388,6 +401,7 @@ ENTRY(divide_error) pushl $TRAP_divide_error<<16 ALIGN error_code: + FIXUP_RING0_GUEST_STACK SAVE_ALL_NOSEGREGS(a) SET_XEN_SEGMENTS(a) testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp) @@ -505,6 +519,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code ENTRY(nmi) +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL + # NMI entry protocol is incompatible with guest kernel in ring 0. + iret +#else # Save state but do not trash the segment registers! # We may otherwise be unable to reload them or copy them to ring 1. pushl %eax @@ -546,6 +564,7 @@ defer_nmi: movl $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \ TRAP_deferred_nmi),%ss:APIC_ICR(%eax) jmp restore_all_xen +#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */ ENTRY(setup_vm86_frame) # Copies the entire stack frame forwards by 16 bytes. diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c index d279204486..70dfe77bab 100644 --- a/xen/arch/x86/x86_32/mm.c +++ b/xen/arch/x86/x86_32/mm.c @@ -180,6 +180,15 @@ void subarch_init_memory(struct domain *dom_xen) page_set_owner(page, dom_xen); } } + + if ( supervisor_mode_kernel ) + { + /* Guest kernel runs in ring 0, not ring 1. 
*/ + struct desc_struct *d; + d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY]; + d[0].b &= ~_SEGMENT_DPL; + d[1].b &= ~_SEGMENT_DPL; + } } long subarch_memory_op(int op, void *arg) @@ -223,7 +232,7 @@ long do_stack_switch(unsigned long ss, unsigned long esp) int nr = smp_processor_id(); struct tss_struct *t = &init_tss[nr]; - fixup_guest_selector(ss); + fixup_guest_stack_selector(ss); current->arch.guest_context.kernel_ss = ss; current->arch.guest_context.kernel_sp = esp; @@ -240,6 +249,10 @@ int check_descriptor(struct desc_struct *d) u32 a = d->a, b = d->b; u16 cs; + /* Let a ring0 guest kernel set any descriptor it wants to. */ + if ( supervisor_mode_kernel ) + return 1; + /* A not-present descriptor will always fault, so is safe. */ if ( !(b & _SEGMENT_P) ) goto good; @@ -273,7 +286,7 @@ int check_descriptor(struct desc_struct *d) /* Validate and fix up the target code selector. */ cs = a >> 16; - fixup_guest_selector(cs); + fixup_guest_code_selector(cs); if ( !guest_gate_selector_okay(cs) ) goto bad; a = d->a = (d->a & 0xffffU) | (cs << 16); diff --git a/xen/arch/x86/x86_32/supervisor_mode_kernel.S b/xen/arch/x86/x86_32/supervisor_mode_kernel.S new file mode 100644 index 0000000000..82f823bfa7 --- /dev/null +++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S @@ -0,0 +1,145 @@ +/* + * Handle stack fixup for guest running in RING 0. + * + * Copyright (c) 2006 Ian Campbell + * + * When a guest kernel is allowed to run in RING 0 a hypercall, + * interrupt or exception interrupting the guest kernel will not cause + * a privilege level change and therefore the stack will not be swapped + * to the Xen stack. + * + * To fix this we look for RING 0 activation frames with a stack + * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel + * frame) and fix this up by locating the Xen stack via the TSS + * and moving the activation frame to the Xen stack. 
In the process we + * convert the frame into an inter-privilege frame returning to RING 1 + * so that we can catch and reverse the process on exit. + */ + +#include +#include +#include + + # Upon entry the stack should be the Xen stack and contain: + # %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN + # On exit the stack should be %ss:%esp (i.e. the guest stack) + # and contain: + # EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN + ALIGN +ENTRY(restore_ring0_guest) + # Point %gs:%esi to guest stack. +RRG0: movw UREGS_ss+4(%esp),%gs + movl UREGS_esp+4(%esp),%esi + + # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack. + movl $(UREGS_kernel_sizeof>>2)+1,%ecx + +1: subl $4,%esi + movl -4(%esp,%ecx,4),%eax +RRG1: movl %eax,%gs:(%esi) + loop 1b + +RRG2: andl $~3,%gs:UREGS_cs+4(%esi) + + movl %gs,%eax + + # We need to do this because these registers are not present + # on the guest stack so they cannot be restored by the code in + # restore_all_guest. +RRG3: mov UREGS_ds+4(%esp),%ds +RRG4: mov UREGS_es+4(%esp),%es +RRG5: mov UREGS_fs+4(%esp),%fs +RRG6: mov UREGS_gs+4(%esp),%gs + +RRG7: movl %eax,%ss + movl %esi,%esp + + ret +.section __ex_table,"a" + .long RRG0,domain_crash_synchronous + .long RRG1,domain_crash_synchronous + .long RRG2,domain_crash_synchronous + .long RRG3,domain_crash_synchronous + .long RRG4,domain_crash_synchronous + .long RRG5,domain_crash_synchronous + .long RRG6,domain_crash_synchronous + .long RRG7,domain_crash_synchronous +.previous + + # Upon entry the stack should be a guest stack and contain: + # EFLAGS, %cs, %eip, ERROR, RETURN + # On exit the stack should be the Xen stack and contain: + # %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN + ALIGN +ENTRY(fixup_ring0_guest_stack) + pushl %eax + pushl %ecx + pushl %ds + pushl %gs + pushl %esi + + movw $__HYPERVISOR_DS,%ax + movw %ax,%ds + + # Point %gs:%esi to guest stack frame. 
+ movw %ss,%ax + movw %ax,%gs + movl %esp,%esi + # Account for entries on the guest stack: + # * Pushed by normal exception/interrupt/hypercall mechanisms + # * EFLAGS, %cs, %eip, ERROR == 4 words. + # * Pushed by the fixup routine + # * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words. + addl $((6+4)*4),%esi + + # %gs:%esi now points to the guest stack before the + # interrupt/exception occurred. + + /* + * Reverse the __TSS macro, giving us the CPU number. + * The TSS for this cpu is at init_tss + ( cpu * 128 ). + */ + str %ecx + shrl $3,%ecx # Calculate GDT index for TSS. + subl $(FIRST_RESERVED_GDT_ENTRY+8),%ecx # %ecx = 2*cpu. + shll $6,%ecx # Each TSS entry is 0x80 bytes + addl $init_tss,%ecx # but we have 2*cpu from above. + + # Load Xen stack from TSS. + movw TSS_ss0(%ecx),%ax +TRP1: movw %ax,%ss + movl TSS_esp0(%ecx),%esp + + pushl %gs + pushl %esi + + # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi + # from guest stack to Xen stack. + movl $10,%ecx +1: subl $4,%esp + subl $4,%esi +TRP2: movl %gs:(%esi),%eax + movl %eax,(%esp) + loop 1b + + # CS = CS|1 to simulate RING1 stack frame. + orl $1,32(%esp) + + popl %esi + popl %gs + popl %ds + popl %ecx + popl %eax + ret +.section __ex_table,"a" + .long TRP1,domain_crash_synchronous + .long TRP2,domain_crash_synchronous +.previous + +domain_crash_synchronous_string: + .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n" + +domain_crash_synchronous: + pushl $domain_crash_synchronous_string + call printf + jmp __domain_crash_synchronous diff --git a/xen/arch/x86/x86_32/traps.c b/xen/arch/x86/x86_32/traps.c index 51c8e9b7de..0e2af9f4c1 100644 --- a/xen/arch/x86/x86_32/traps.c +++ b/xen/arch/x86/x86_32/traps.c @@ -256,8 +256,14 @@ void init_int80_direct_trap(struct vcpu *v) * We can't virtualise interrupt gates, as there's no way to get * the CPU to automatically clear the events_mask variable. Also we * must ensure that the CS is safe to poke into an interrupt gate. 
+ * + * When running with supervisor_mode_kernel enabled a direct trap + * to the guest OS cannot be used because the INT instruction will + * switch to the Xen stack and we need to swap back to the guest + * kernel stack before passing control to the system call entry point. */ - if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ) + if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) || + supervisor_mode_kernel ) { v->arch.int80_desc.a = v->arch.int80_desc.b = 0; return; @@ -278,8 +284,8 @@ long do_set_callbacks(unsigned long event_selector, { struct vcpu *d = current; - fixup_guest_selector(event_selector); - fixup_guest_selector(failsafe_selector); + fixup_guest_code_selector(event_selector); + fixup_guest_code_selector(failsafe_selector); d->arch.guest_context.event_callback_cs = event_selector; d->arch.guest_context.event_callback_eip = event_address; @@ -289,12 +295,51 @@ long do_set_callbacks(unsigned long event_selector, return 0; } -void hypercall_page_initialise(void *hypercall_page) +static void hypercall_page_initialise_ring0_kernel(void *hypercall_page) +{ + extern asmlinkage int hypercall(void); + char *p; + int i; + + /* Fill in all the transfer points with template machine code. */ + + for ( i = 0; i < NR_hypercalls; i++ ) + { + p = (char *)(hypercall_page + (i * 32)); + + *(u8 *)(p+ 0) = 0x9c; /* pushf */ + *(u8 *)(p+ 1) = 0xfa; /* cli */ + *(u8 *)(p+ 2) = 0xb8; /* mov $,%eax */ + *(u32 *)(p+ 3) = i; + *(u8 *)(p+ 7) = 0x9a; /* lcall $__HYPERVISOR_CS,&hypercall */ + *(u32 *)(p+ 8) = (u32)&hypercall; + *(u16 *)(p+12) = (u16)__HYPERVISOR_CS; + *(u8 *)(p+14) = 0xc3; /* ret */ + } + + /* + * HYPERVISOR_iret is special because it doesn't return and expects a + * special stack frame. Guests jump at this transfer point instead of + * calling it. 
+ */ + p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32)); + *(u8 *)(p+ 0) = 0x50; /* push %eax */ + *(u8 *)(p+ 1) = 0x9c; /* pushf */ + *(u8 *)(p+ 2) = 0xfa; /* cli */ + *(u8 *)(p+ 3) = 0xb8; /* mov $,%eax */ + *(u32 *)(p+ 4) = __HYPERVISOR_iret; + *(u8 *)(p+ 8) = 0x9a; /* lcall $__HYPERVISOR_CS,&hypercall */ + *(u32 *)(p+ 9) = (u32)&hypercall; + *(u16 *)(p+13) = (u16)__HYPERVISOR_CS; +} + +static void hypercall_page_initialise_ring1_kernel(void *hypercall_page) { char *p; int i; /* Fill in all the transfer points with template machine code. */ + for ( i = 0; i < (PAGE_SIZE / 32); i++ ) { p = (char *)(hypercall_page + (i * 32)); @@ -316,6 +361,14 @@ void hypercall_page_initialise(void *hypercall_page) *(u16 *)(p+ 6) = 0x82cd; /* int $0x82 */ } +void hypercall_page_initialise(void *hypercall_page) +{ + if ( supervisor_mode_kernel ) + hypercall_page_initialise_ring0_kernel(hypercall_page); + else + hypercall_page_initialise_ring1_kernel(hypercall_page); +} + /* * Local variables: * mode: C diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index 4770a0436a..855a6510f2 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -228,7 +228,7 @@ long subarch_memory_op(int op, void *arg) long do_stack_switch(unsigned long ss, unsigned long esp) { - fixup_guest_selector(ss); + fixup_guest_stack_selector(ss); current->arch.guest_context.kernel_ss = ss; current->arch.guest_context.kernel_sp = esp; return 0; @@ -315,7 +315,7 @@ int check_descriptor(struct desc_struct *d) /* Validate and fix up the target code selector. 
*/ cs = a >> 16; - fixup_guest_selector(cs); + fixup_guest_code_selector(cs); if ( !guest_gate_selector_okay(cs) ) goto bad; a = d->a = (d->a & 0xffffU) | (cs << 16); diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index e7e725f110..7fc91e88e7 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -170,6 +170,13 @@ long do_dom0_op(struct dom0_op *u_dom0_op) cpumask_t cpu_exclude_map; static domid_t rover = 0; + /* + * Running the domain 0 kernel in ring 0 is not compatible + * with multiple guests. + */ + if ( supervisor_mode_kernel ) + return -EINVAL; + dom = op->u.createdomain.domain; if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) ) { diff --git a/xen/common/kernel.c b/xen/common/kernel.c index 56f2c21abf..1d4de4ecc4 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -195,6 +195,8 @@ long do_xen_version(int cmd, void *arg) (1U << XENFEAT_writable_page_tables) | (1U << XENFEAT_auto_translated_physmap) | (1U << XENFEAT_pae_pgdir_above_4gb); + if ( supervisor_mode_kernel ) + fi.submap |= 1U << XENFEAT_supervisor_mode_kernel; break; default: return -EINVAL; diff --git a/xen/include/asm-ia64/config.h b/xen/include/asm-ia64/config.h index 85639685d6..90111f1e94 100644 --- a/xen/include/asm-ia64/config.h +++ b/xen/include/asm-ia64/config.h @@ -40,6 +40,8 @@ //leave SMP for a later time //#undef CONFIG_SMP +#define supervisor_mode_kernel (0) + #define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */ #ifndef __ASSEMBLY__ diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index 502a136fd8..b901ed3036 100644 --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -37,6 +37,12 @@ #define NR_CPUS 32 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL +# define supervisor_mode_kernel (1) +#else +# define supervisor_mode_kernel (0) +#endif + /* Linkage for x86 */ #define __ALIGN .align 16,0x90 #define __ALIGN_STR ".align 16,0x90" diff --git a/xen/include/asm-x86/desc.h 
b/xen/include/asm-x86/desc.h index 7208c04ce0..f7d60fae61 100644 --- a/xen/include/asm-x86/desc.h +++ b/xen/include/asm-x86/desc.h @@ -27,10 +27,23 @@ #endif /* Fix up the RPL of a guest segment selector. */ -#define fixup_guest_selector(sel) \ +#define __fixup_guest_selector(sel) \ ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) : \ (((sel) & ~3) | GUEST_KERNEL_RPL)) +/* Stack selectors don't need fixing up if the kernel runs in ring 0. */ +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL +#define fixup_guest_stack_selector(ss) ((void)0) +#else +#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss) +#endif + +/* + * Code selectors are always fixed up. It allows the Xen exit stub to detect + * return to guest context, even when the guest kernel runs in ring 0. + */ +#define fixup_guest_code_selector(cs) __fixup_guest_selector(cs) + /* * We need this function because enforcing the correct guest kernel RPL is * unsufficient if the selector is poked into an interrupt, trap or call gate. 
diff --git a/xen/include/asm-x86/x86_32/asm_defns.h b/xen/include/asm-x86/x86_32/asm_defns.h index b008722d8a..c1e5e54867 100644 --- a/xen/include/asm-x86/x86_32/asm_defns.h +++ b/xen/include/asm-x86/x86_32/asm_defns.h @@ -48,11 +48,26 @@ #ifdef PERF_COUNTERS #define PERFC_INCR(_name,_idx) \ - lock incl perfcounters+_name(,_idx,4) + lock incl perfcounters+_name(,_idx,4) #else #define PERFC_INCR(_name,_idx) #endif +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL +#define FIXUP_RING0_GUEST_STACK \ + testl $2,8(%esp); \ + jnz 1f; /* rings 2 & 3 permitted */ \ + testl $1,8(%esp); \ + jz 2f; \ + ud2; /* ring 1 should not be used */ \ + 2:cmpl $(__HYPERVISOR_VIRT_START),%esp; \ + jge 1f; \ + call fixup_ring0_guest_stack; \ + 1: +#else +#define FIXUP_RING0_GUEST_STACK +#endif + #define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v) #define XBUILD_SMP_INTERRUPT(x,v) \ asmlinkage void x(void); \ @@ -61,6 +76,7 @@ __asm__( \ ".globl " STR(x) "\n\t" \ STR(x) ":\n\t" \ "pushl $"#v"<<16\n\t" \ + STR(FIXUP_RING0_GUEST_STACK) \ STR(SAVE_ALL(a)) \ "movl %esp,%eax\n\t" \ "pushl %eax\n\t" \ @@ -72,6 +88,7 @@ __asm__( \ __asm__( \ "\n" __ALIGN_STR"\n" \ "common_interrupt:\n\t" \ + STR(FIXUP_RING0_GUEST_STACK) \ STR(SAVE_ALL(a)) \ "movl %esp,%eax\n\t" \ "pushl %eax\n\t" \ -- 2.30.2